//author Sylvain Bertrand <digital.ragnarok@gmail.com>
//Protected by GNU Affero GPL v3 with some exceptions.

//NOTES:
//This is raw, but very linear and simple radeon 3D pipeline programming.  The
//ISA (Instruction Set Architecture) documentation (see AMD web site) lets you
//understand a lot of this programming. Reading the "official" code (drm + mesa)
//is a hundredfold more difficult.  Don't be afraid, it's not that hard.  Keep
//in mind: alignment constraints, unit of work size constraints.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <string.h>
#include <math.h>
#include <stdint.h>
#include <stdlib.h>
#include <error.h>
#include <sys/mman.h>
#include <endian.h>
#include <errno.h>

#include <linux/types.h>

#include <alga/pixel_fmts.h>
#include <alga/amd/dce6/dce6.h>
#include <alga/amd/si/ioctl.h>
#include <alga/amd/si/pkt.h>
#include <alga/amd/si/cps_regs.h>
#include <alga/amd/si/gpu_regs_cfg.h>
#include <alga/amd/si/gpu_regs_sh.h>
#include <alga/amd/si/gpu_regs_ctx.h>

//place value v in the register field described by mask: shift v to the
//field's least significant bit and clamp it to the field's width
static uint32_t set(uint32_t mask,uint32_t v)
{
        uint8_t shift;
        //a zero mask has no field to set; without this guard ffs(0)
        //returns 0, shift underflows to 255 and the shift below is
        //undefined behavior
        if(!mask) return 0;
        shift=ffs(mask)-1;
        return (v<<shift)&mask;
}

//everything a single 3D draw needs: gpu virtual addresses of the two
//shaders and of the framebuffer, plus the framebuffer dimensions
struct params_3d {
  uint64_t vs_gpu_addr;//vertex shader gpu address (256 bytes aligned)
  uint64_t ps_gpu_addr;//pixel/fragment shader gpu address (256 bytes aligned)
  uint64_t w;//framebuffer width in pixels
  uint64_t h;//framebuffer height in pixels
  uint64_t fb_gpu_addr;//framebuffer gpu address
};

//shorthand: report an error message (does not exit, no errno decoding)
#define e(m,...) error(0,0,m,##__VA_ARGS__)
//shorthand: print a message followed by a newline
#define o(m,...) printf(m "\n",##__VA_ARGS__)
#define ul unsigned long
#define ull unsigned long long

//maximum size, in dwords, of the IB (Indirect Buffer) of commands built below
#define IB_DWS_N_MAX (16 * 64)

//vertex layout matching the buffer resource descriptors below:
//8 consecutive floats, i.e. a 0x20 bytes stride
struct vertex {
  float position[4];//x,y,z,w position exported to pos0
  float param0[4];//generic parameter 0, used as an rgba color here
};

//one triangle plus a trailing all-zero "null" vertex (the descriptors
//advertise 4 records)
#define VERTICES_N 4
static struct vertex vertices[VERTICES_N]={
  {
    { -0.2f, -0.9f, 0.0f, 1.0f },
    { 1.0f, 0.0f, 0.0f, 1.0f }
  },
  {
    { -0.9f, 0.9f, 0.0f, 1.0f },
    { 0.0f, 1.0f, 0.0f, 1.0f }
  },
  {
    { 0.9f, 0.9f, 0.0f, 1.0f },
    { 0.0f, 0.0f, 1.0f, 1.0f }
  },
  {
    { 0, 0, 0, 0 },
    { 0, 0, 0, 0 }
  }
};

//two 4-dword buffer resource descriptors (positions, then param 0), loaded
//into the vertex shader user sgprs; dwords 0 and 4 are patched at runtime
//with the vram address of the vertex buffer
static uint32_t buf_res_descs[]={
  //init with the vram lower 32 bits vertex position buffer address
  0x00000000,
  //oring the upper 8 remaining bits of buffer address.
  //stride=0x20 (8 floats (4 position+4 color components) of 4 bytes.
  0x00200000,
  //4 records, namely 4 vertices, the last one is "null"
  0x00000004,
  //dst_sel_x=4(x) dst_sel_y=5(y) dst_sel_z=6(z) dest_sel_w=7(w)
  //nfmt=7(float) dfmt=14(32_32_32_32)
  0x00077fac,
  //----------------------------------------------------------------------------
  //init with the vram lower 32 bits vertex param 0 buffer address
  0x00000000,
  //oring the upper 8 remaining bits of buffer address.
  //stride=0x20 (8 floats (4 position+4 param 0 components) of 4 bytes.
  0x00200000,
  //4 records, namely 4 vertices, the last one is "null"
  0x00000004,
  //dst_sel_x=4(r) dst_sel_y=5(g) dst_sel_z=6(b) dst_sel_w=7(a)
  //(customary to use color terminology for params)
  //nfmt=7(float) dfmt=14(32_32_32_32)
  0x00077fac
};

//pre-assembled vertex shader machine code (GCN/SI encoding):
// o USER_SGPR[3:0]<--buffer resource descriptor of the buffer of vertex
//   positions
// o USER_SGPR[7:4]<--buffer resource descriptor of the buffer of vertex
//   parameter 0 (unused here)
// note: the done bit in export instructions is only for vertex positions.
static const uint8_t vs_vgprs_n=9;
static const uint8_t vs_user_sgprs_n=8;
static const uint8_t vs_sgprs_n=8;//at least vs_user_sgprs_n
static const uint8_t vs_exported_params_n=1;
static  uint32_t vs[]={
  0xe00c2000,//buffer_load_format_xyzw idxen=1
  0x80000100,//                        soffset=128(=0) vdata=1
  0xbf8c0000,//s_waitcnt
  0xe00c2000,//buffer_load_format_xyzw idxen=1
  0x80010500,//            soffset=128(=0) srsrc=1(sgprs[4:7]) vdata=5
  0xbf8c0000,//s_waitcnt
  0xf80008cf,//export en=0b1111 done=1 tgt=12(pos0)
  0x04030201,//       vsrc0=1 vsrc1=2 vsrc2=3 vsrc3=4
  0xbf8c0000,//s_waitcnt
  0xf800020f,//export en=0b1111 tgt=32(param0)
  0x08070605,//       vsrc0=5 vsrc1=6 vsrc2=7 vsrc3=8
  0xbf8c0000,//s_waitcnt
  0xbf810000 //s_endpgm
};

//pre-assembled pixel/fragment shader machine code: exports a constant
//opaque white color, packed to fp16 pairs (matches SSCF_FP16_ABGR below).
//m0 is put by the spi right after the last user pre-loaded sgprs. m0 must
//be loaded in order to index properly the parameters in lds.
//note: we don't deal with the "valid mask" for pixels in the exec register.
static const uint8_t ps_vgprs_n=4;
static const uint8_t ps_user_sgprs_n=0;
static const uint8_t ps_sgprs_n=0;//at least ps_user_sgprs_n
static uint32_t ps[]={
  0x7e0002f2,//v_mov_b32 src0=242(1.0f) vdst=0
  0xbf8c0000,//s_waitcnt
  0x7e0202f2,//v_mov_b32 src0=242(1.0f) vdst=1
  0xbf8c0000,//s_waitcnt
  0x7e0402f2,//v_mov_b32 src0=242(1.0f) vdst=2
  0xbf8c0000,//s_waitcnt
  0x7e0602f2,//v_mov_b32 src0=242(1.0f) vdst=3
  0xbf8c0000,//s_waitcnt
  0x5e000300,//v_cvt_pkrtz_f16_f32 vdst=0 vsrc1=1 src0=256(vgpr0)
  0x5e020702,//v_cvt_pkrtz_f16_f32 vdst=1 vsrc1=3 src0=258(vgpr2)
  0xf8001c0f,//exp vm=1 done=1 compr=1 en=0x1111
  0x01000100,//    vsrc3=1 vsrc2=0 vsrc1=1 vsrc0=0
  0xbf8c0000,//s_waitcnt
  0xbf810000 //s_endpgm
};

//return the offset aligned on power of two order equal or above the offset
//argument
static uint64_t next_aligned_of(uint64_t of,uint64_t order)
{
  //use a 64 bits literal: a plain "1<<order" is a 32 bits int shift,
  //which is undefined behavior for order>=31 even though the result is
  //stored in a uint64_t
  uint64_t blk_sz=(UINT64_C(1)<<order);
  uint64_t mask=blk_sz-1;
  if(of&mask) return (of+blk_sz)&~mask;
  else return of;
}

static void cpy_htole32(uint32_t *dst, uint32_t *src, uint64_t dws_n)
{
  while(1){
    if(!dws_n) break;
    *dst++=htole32(*src++);
    dws_n--;
  }
}

//return the upper 32 bits of a 64 bits value
static uint32_t upper_32_bits(uint64_t x)
{
  return (uint32_t)(x>>32);
}

//return the lower 32 bits of a 64 bits value
static uint32_t lower_32_bits(uint64_t x)
{
  return (uint32_t)(x&0xffffffffULL);
}

union f2u {
	float f;
	uint32_t u;
};
//reinterpret the bits of a float as a uint32_t (for writing float register
//values into the dword command stream)
static inline uint32_t f2u(float f)
{
	uint32_t u;

	memcpy(&u, &f, sizeof(u));
	return u;
}

//append one dword to the IB (converted to the little endian byte order the
//command processor reads) and advance the write cursor
#define ib_wr(x) *((*ib)++)=htole32(x)

//mandatory packets at the head of the command stream: cache synchronization
//then context control
static void prelude(uint32_t **ib)
{
  //----------------------------------------------------------------------------
  //sync shader read/write caches, read/write L1/L2texture caches, read caches
  //of color blocks (we don't use the depth block).
  ib_wr(PKT3(PKT3_SURF_SYNC,4));
  //CP_COHER_CTL_0
  ib_wr(CCC_SH_ICACHE_ACTION_ENA|CCC_SH_KCACHE_ACTION_ENA|CCC_TCL2_ACTION_ENA
		                                      |CCC_CB_ACTION_ENA);
  //CP_COHER_SZ: full range
  ib_wr(0xffffffff);
  //CP_COHER_BASE
  ib_wr(0);
  ib_wr(0x0000000a);//polling interval, 0xa(10) * 16 clocks
  //----------------------------------------------------------------------------

  //----------------------------------------------------------------------------
  //seems mandatory at the start of a command stream
  ib_wr(PKT3(PKT3_CTX_CTL,2));
  ib_wr(0x80000000);
  ib_wr(0x80000000);
  //----------------------------------------------------------------------------
}

//Config reg programming, then, in theory, flushing before modifying their
//values. If same value for *ALL* accel code, should go into the linux
//module to be set once and for all.
static void cfg(uint32_t **ib)
{
  //----------------------------------------------------------------------------
  //VGT (Vertex Grouper and Tesselator block): render triangle lists
  ib_wr(PKT3(PKT3_SET_CFG_REG,2));
  ib_wr(CFG_REG_IDX(VGT_PRIM_TYPE));
  //VGT_PRIM_TYPE
  ib_wr(set(VPT_PRIM_TYPE,VPT_TRILIST));
  //----------------------------------------------------------------------------

  //----------------------------------------------------------------------------
  //PA (Primitive Assembler) CL (CLipper)
  ib_wr(PKT3(PKT3_SET_CFG_REG,2));
  ib_wr(CFG_REG_IDX(PA_CL_ENHANCE));
  //PA_CL_ENHANCE
  ib_wr(set(PCE_CLIP_SEQ_N,3)|PCE_CLIP_VTX_REORDER_ENA);
  //----------------------------------------------------------------------------
}

//zero/neutral initialization of the GPU context registers we don't actively
//use (tesselation, geometry shading, stream out, primitive ids...)
static void ctx_misc_init(uint32_t **ib)
{
  //basic init GPU context, XXX: not using the CLR_CTX command ???
  ib_wr(PKT3(PKT3_SET_CTX_REG,14));
  ib_wr(CTX_REG_IDX(VGT_OUTPUT_PATH_CTL));
  //VGT_OUTPUT_PATH_CTL
  ib_wr(0);
  //VGT_HOS_CTL
  ib_wr(0);
  //VGT_HOS_MAX_TESS_LVL
  ib_wr(0);
  //VGT_HOS_MIN_TESS_LVL
  ib_wr(0);
  //VGT_HOS_REUSE_DEPTH
  ib_wr(0);
  //VGT_GROUP_PRIM_TYPE
  ib_wr(0);
  //VGT_GROUP_FIRST_DECR
  ib_wr(0);
  //VGT_GROUP_DECR
  ib_wr(0);
  //VGT_GROUP_VECT_0_CTL
  ib_wr(0);
  //VGT_GROUP_VECT_1_CTL
  ib_wr(0);
  //VGT_GROUP_VECT_0_FMT_CTL
  ib_wr(0);
  //VGT_GROUP_VECT_1_FMT_CTL
  ib_wr(0);
  //VGT_GS_MODE: no geometry shading
  ib_wr(0);
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(VGT_PRIM_ID_ENA));
  //VGT_PRIM_ID_ENA
  ib_wr(0);
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(VGT_PRIM_ID_RESET));
  //VGT_PRIM_ID_RESET
  ib_wr(0);
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,3));
  ib_wr(CTX_REG_IDX(VGT_STRMOUT_CFG));
  //VGT_STRMOUT_CFG: no stream out
  ib_wr(0);
  //VGT_STRMOUT_BUF_CFG
  ib_wr(0);
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(IA_MULTI_VGT_PARAM));
  //IA_MULTI_VGT_PARAM
  ib_wr(IMVP_SWITCH_ON_EOP | IMVP_PARTIAL_VS_WAVE_ON
                                                 | set(IMVP_PRIM_GROUP_SZ, 63));
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,3));
  ib_wr(CTX_REG_IDX(VGT_REUSE_OFF));
  //VGT_REUSE_OFF
  ib_wr(0);
  //VGT_VTX_CNT_ENA
  ib_wr(0);
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(VGT_SHADER_STAGES_ENA));
  //VGT_SHADER_STAGES_ENA: 0 selects the plain vs/ps pipeline
  ib_wr(0);
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,3));
  ib_wr(CTX_REG_IDX(PA_SC_CENTROID_PRIORITY_0));
  //PA_SC_CENTROID_PRIORITY_0
  ib_wr(0x76543210);
  //PA_SC_CENTROID_PRIORITY_1
  ib_wr(0xfedcba98);
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(DB_EQAA));
  //DB_EQAA
  ib_wr(0x00110000);
}

//VGT context registers: widest possible vertex index range, no index offset,
//no multi primitive index buffer reset
static void ctx_vgt(uint32_t **ib)
{
  //VGT (Vertex Grouper and Tesselator block)
  ib_wr(PKT3(PKT3_SET_CTX_REG,5));
  ib_wr(CTX_REG_IDX(VGT_MAX_VTX_IDX));
  //VGT_MAX_VTX_IDX: all bits set, i.e. no upper bound
  ib_wr(~0);
  //VGT_MIN_VTX_IDX
  ib_wr(0);
  //VGT_IDX_OF
  ib_wr(0);
  //VGT_MULTI_PRIM_IB_RESET_IDX
  ib_wr(0);
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(VGT_MULTI_PRIM_IB_RESET_ENA));
  //VGT_MULTI_PRIM_IB_RESET_ENA
  ib_wr(0);
}

//program the spi for the vertex shader: user sgprs content, shader gpu
//address, register budget, and export formats
static void ctx_spi_sh_vs(uint32_t **ib,struct params_3d *params_3d)
{
  //setup specific for the vertex shader
  
  //Tell the spi to pre-load the buffer descriptors in user sgprs
  ib_wr(PKT3(PKT3_SET_SH_REG,9));
  ib_wr(SH_REG_IDX(SPI_SH_USER_DATA_VS_0));
  //SPI_SH_USER_DATA_VS_0
  ib_wr(buf_res_descs[0]);
  //SPI_SH_USER_DATA_VS_1
  ib_wr(buf_res_descs[1]);
  //SPI_SH_USER_DATA_VS_2
  ib_wr(buf_res_descs[2]);
  //SPI_SH_USER_DATA_VS_3
  ib_wr(buf_res_descs[3]);
  //SPI_SH_USER_DATA_VS_4
  ib_wr(buf_res_descs[4]);
  //SPI_SH_USER_DATA_VS_5
  ib_wr(buf_res_descs[5]);
  //SPI_SH_USER_DATA_VS_6
  ib_wr(buf_res_descs[6]);
  //SPI_SH_USER_DATA_VS_7
  ib_wr(buf_res_descs[7]);
  
  ib_wr(PKT3(PKT3_SET_SH_REG,5));
  ib_wr(SH_REG_IDX(SPI_SH_PGM_LO_VS));
  //SPI_SH_PGM_LO_VS: shader gpu address is 256 bytes aligned, stored >>8
  ib_wr(lower_32_bits(params_3d->vs_gpu_addr>>8));
  //SPI_SH_PGM_HI_VS
  ib_wr(set(SSPHV_MEM_BASE,upper_32_bits(params_3d->vs_gpu_addr>>8)));
  //SPI_SH_PGM_RSRC_VS_0: the vgprs are allocated using units of 4 vgprs,
  //sgprs using units of 8 sgprs. Don't forget to book 2 additional
  //sgprs for vcc. Both counts are minus one unit.
  ib_wr(set(SSPRV_VGPRS,((vs_vgprs_n-1)/4))
                                       | set(SSPRV_SGPRS,((vs_sgprs_n+2)-1)/8));
  //SPI_SH_PGM_RSRC_VS_1: tell the spi the count of sgprs which are not vcc.
  ib_wr(set(SSPRV_USER_SGPR,vs_user_sgprs_n));
  
  //our vertex shader exports only the color as parameter
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(SPI_VS_OUT_CFG));
  //SPI_VS_OUT_CFG: field is count minus one
  ib_wr(set(SVOC_VS_PARAM_EXPORT_COUNT,vs_exported_params_n-1));
  
  //The spi needs to be told what packing format is used by the vertex
  //shader to export the position.
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(SPI_SH_POS_FMT));
  //SPI_SH_POS_FMT
  ib_wr(set(SSPF_POS_0_EXPORT_FMT,SSPF_4COMP));
}

static void ctx_spi_sh_ps(uint32_t **ib,struct params_3d *params_3d)
{
  //setup specific for the pixel/fragment shader

  ib_wr(PKT3(PKT3_SET_SH_REG,5));
  ib_wr(SH_REG_IDX(SPI_SH_PGM_LO_PS));
  //SPI_SH_PGM_LO_PS
  ib_wr(lower_32_bits(params_3d->ps_gpu_addr>>8));
  //SPI_SH_PGM_HI_PS
  ib_wr(set(SSPHP_MEM_BASE,upper_32_bits(params_3d->ps_gpu_addr>>8)));
  //SPI_SH_PGM_RSRC_PS_0: we must account 1 additional sgpr for m0 since
  //which will be loaded in the sgpr right after the last user sgpr.
  ib_wr(set(SSPRP_VGPRS,((ps_vgprs_n-1)/4))
                                     | set(SSPRP_SGPRS,((ps_sgprs_n+1+2)-1)/8));
  //SPI_SH_PGM_RSRC_PS_1: same constrains than the vertex shaders
  //plus the fact the spi will load the m0 in the first sgpr after the
  //last user loaded sgpr, namely sgpr6 in this case.
  ib_wr(set(SSPRP_USER_SGPR,ps_sgprs_n));
  
  //tell the spi the pixel/fragment shader will need perpective center
  //interpolation data in input (mandatory or gpu hang)
  ib_wr(PKT3(PKT3_SET_CTX_REG,3));
  ib_wr(CTX_REG_IDX(SPI_PS_INPUT_ENA));
  //SPI_PS_INPUT_ENA
  ib_wr(SPIE_PERSP_CENTER_ENA);
  //SPI_PS_INPUT_ADDR
  ib_wr(SPIA_PERSP_CENTER_ENA);
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(SPI_PS_IN_CTL));
  //SPI_PS_IN_CTL: 1 parameter to interpolate. Must have at least one 
  ib_wr(set(SPIC_INTERP_N,1));
  
  //don't care about z depth export
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(SPI_SH_Z_FMT));
  //SPI_SH_Z_FMT
  ib_wr(set(SSZF_Z_EXPORT_FMT,SSZF_ZERO));
  
  //only 1 input param on 32, then only SPI_PS_INPUT_CTL_00
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(SPI_PS_INPUT_CTL_00));
  //SPI_PS_INPUT_CTL_00
  ib_wr(0);
  
  //The spi sends the pixel color exported by a pixel/fragment shader to
  //a cb, it needs to be told about the special color packing format the
  //shader used.
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(SPI_SH_COLOR_FMT));
  //SPI_SH_COLOR_FMT
  ib_wr(set(SSCF_COLOR_0_EXPORT_FMT,SSCF_FP16_ABGR));
}

//SH (SHader block) context registers: vertex shader first, then
//pixel/fragment shader
static void ctx_spi_sh(uint32_t **ib,struct params_3d *params_3d)
{
  //SH (SHader block)
  ctx_spi_sh_vs(ib,params_3d);
  ctx_spi_sh_ps(ib,params_3d);
}

//SPI context registers, then the per-shader SH registers
static void ctx_spi(uint32_t **ib,struct params_3d *params_3d)
{
  //SPI (Shader Processor Interpolator)
  //disable the point primitive sprite
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(SPI_INTERPOL_CTL_0));
  //SPI_INTERPOL_CTL_0
  ib_wr(0);
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(SPI_BARYC_CTL));
  //SPI_BARYC_CTL: want 0 in working sample
  ib_wr(0);
  
  ctx_spi_sh(ib,params_3d);
}

//PA SU context registers: pixel center, point/line defaults (unused
//primitives), and triangle draw mode
static void ctx_pa_su(uint32_t **ib)
{
  //PA (Primitive Assembler) SU (Setup Unit)
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(PA_SU_VTX_CTL));
  //PA_SU_VTX_CTL: tells the PA (Primitive Assembler) SU (Setup Unit)
  //to place the(?) pixel at the center of the vertex?
  ib_wr(PSVC_PIX_CENTER);
  
  //setup for the PA (Primitive Assembler) SU (Setup Unit) for the
  //point/line primitive rendering: we do not render point
  //or line primitives.
  //Set it to 8 like in working samples
  ib_wr(PKT3(PKT3_SET_CTX_REG,4));
  ib_wr(CTX_REG_IDX(PA_SU_POINT_SZ));
  //PA_SU_POINT_SZ
  ib_wr(set(PSPS_H,8)|set(PSPS_W,8));
  //PA_SU_POINT_MINMAX
  ib_wr(set(PSPM_MIN,8)|set(PSPM_MAX,8));
  //PA_SU_LINE_CTL
  ib_wr(set(PSLC_W,8));
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(PA_SU_POLY_OF_CLAMP));
  //PA_SU_POLY_OF_CLAMP: tell the PA (Primitive Assembler) SU
  //(Setup Unit) for polygon not to clamp something ?
  ib_wr(0);
  
  //related to the SC (Scan Converter)
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(PA_SU_SC_MODE_CTL));
  //PA_SU_SC_MODE_CTL: removed FACE to follow working samples
  ib_wr(set(PSSMC_POLY_MODE_FRONT_PTYPE,PSSMC_DRAW_TRIANGLES)
                          | set(PSSMC_POLY_MODE_BACK_PTYPE,PSSMC_DRAW_TRIANGLES)
                          | PSSMC_PROVOKING_VTX_LAST);
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(PA_SU_PRIM_FILTER_CTL));
  //PA_SU_PRIM_FILTER_CTL
  ib_wr(0);
}

//PA CL context registers: disable the guard band, default nan/inf handling,
//no vertex shader output clipping, ucp mode 3
static void ctx_pa_cl(uint32_t **ib)
{
  //PA (Primitive Assembler) CL (CLipper)
  ib_wr(PKT3(PKT3_SET_CTX_REG,5));
  ib_wr(CTX_REG_IDX(PA_CL_GB_VERT_CLIP_ADJ));
  //disable GB (Guard Band) by setting those registers to 1.0f
  //PA_CL_GB_VERT_CLIP_ADJ
  ib_wr(f2u(1.0f));
  //PA_CL_GB_VERT_DISC_ADJ
  ib_wr(f2u(1.0f));
  //PA_CL_GB_HORZ_CLIP_ADJ
  ib_wr(f2u(1.0f));
  //PA_CL_GB_HORZ_DISC_ADJ
  ib_wr(f2u(1.0f));
  
  //define the way the PA (Primitive Assembler) CL (CLipper) will
  //behave regarding NAN (Not A Number) and INF (INFinity) values
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(PA_CL_NANINF_CTL));
  //PA_CL_NANINF_CTL: to hardware default behaviour
  ib_wr(0);
  
  //no clipping done on the input from the vertex shader
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(PA_CL_VS_OUT_CTL));
  //PA_CL_VS_OUT_CTL
  ib_wr(0);
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(PA_CL_CLIP_CTL));
  //PA_CL_CLIP_CTL: ucp mode 3=always expand and clip as trifan
  ib_wr(set(PCCC_PS_UCP_MODE,3) | PCCC_DX_LINEAR_ATTR_CLIP_ENA);
}

//viewport 0 transform: map clip-space [-1,1] x/y onto the w*h framebuffer
//(scale = half size, offset = half size), z mapped to [0,1]
static void ctx_pa_sc_vport_0_te(uint32_t **ib, struct params_3d *params_3d)
{
  //PA (Primitive Assembler) SC (Scan Converter) VPORT (ViewPORT) 0 TE
  //(Transform Engine)
  ib_wr(PKT3(PKT3_SET_CTX_REG,7));
  ib_wr(CTX_REG_IDX(PA_SC_VPORT_0_TE_X_SCALE));
  //PA_SC_VPORT_0_TE_X_SCALE
  ib_wr(f2u(params_3d->w/2.0f));
  //PA_SC_VPORT_0_TE_X_OF
  ib_wr(f2u(params_3d->w / 2.0f));
  //PA_SC_VPORT_0_TE_Y_SCALE
  ib_wr(f2u(params_3d->h / 2.0f));
  //PA_SC_VPORT_0_TE_Y_OF
  ib_wr(f2u(params_3d->h / 2.0f));
  //PA_SC_VPORT_0_TE_Z_SCALE: stick to working sample values
  ib_wr(f2u(0.5f));
  //PA_SC_VPORT_0_TE_Z_OF: stick to working sample values
  ib_wr(f2u(0.5f));
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,3));
  ib_wr(CTX_REG_IDX(PA_SC_VPORT_0_TE_ZMIN));
  //PA_SC_VPORT_0_TE_ZMIN: min Z value from VPORT TE
  ib_wr(f2u(0.0f));
  //PA_SC_VPORT_0_TE_ZMAX: max Z value from VPORT TE
  ib_wr(f2u(1.0f));
}

//viewport 0: full-framebuffer scissor, then its transform engine registers
static void ctx_pa_sc_vport_0(uint32_t **ib, struct params_3d *params_3d)
{
  //PA (Primitive Assembler) SC (Scan Converter) VPORT (ViewPORT) 0
  ib_wr(PKT3(PKT3_SET_CTX_REG,3));
  ib_wr(CTX_REG_IDX(PA_SC_VPORT_0_SCISSOR_TL));
  //PA_SC_VPORT_0_SCISSOR_TL
  ib_wr(set(PSVST_X,0)|set(PSVST_Y,0));
  //PA_SC_VPORT_0_SCISSOR_BR
  ib_wr(set(PSVSB_X,params_3d->w)|set(PSVSB_Y,params_3d->h));

  ctx_pa_sc_vport_0_te(ib,params_3d);
}

//global viewport transform engine control: enable all x/y/z scale/offset
static void ctx_pa_sc_vports_te(uint32_t **ib)
{
  //PA (Primitive Assembler) SC (Scan Converter) VPORT (ViewPORT) TE (Transform
  //Engine)
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(PA_SC_VPORT_TE_CTL));
  //PA_SC_VPORT_TE_CTL: no so-called perspective division
  ib_wr(PSVTC_VPORT_X_SCALE_ENA|PSVTC_VPORT_X_OF_ENA|PSVTC_VPORT_Y_SCALE_ENA
              |PSVTC_VPORT_Y_OF_ENA|PSVTC_VPORT_Z_SCALE_ENA|PSVTC_VPORT_Z_OF_ENA
              |PSVTC_VTX_W0_FMT);
}

//all viewport programming: viewport 0 registers, then the global transform
//engine control
static void ctx_pa_sc_vports(uint32_t **ib, struct params_3d *params_3d)
{
  //PA (Primitive Assembler) SC (Scan Converter) VPORT (ViewPORT)
  ctx_pa_sc_vport_0(ib,params_3d);
  ctx_pa_sc_vports_te(ib);
}
 
//PA SC context registers: edge rule, anti-aliasing off, clip rectangles and
//every scissor opened to the full w*h framebuffer, then the viewports
static void ctx_pa_sc(uint32_t **ib, struct params_3d *params_3d)
{
  //PA (Primitive Assembler) SC (Scan Converter)
  ib_wr(PKT3(PKT3_SET_CTX_REG,3));
  ib_wr(CTX_REG_IDX(PA_SC_MODE_CTL_0));
  //PA_SC_MODE_CTL_0
  ib_wr(0);
  //PA_SC_MODE_CTL_1
  ib_wr(0);
  
  //defines how to render the edge of primitives
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(PA_SC_EDGERULE));
  //PA_SC_EDGERULE
  ib_wr(0xaaaaaaaa);
 
  //---------------------------------------------------------------------------- 
  //Anti-Aliasing... probably
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(PA_SC_AA_CFG));
  //PA_SC_AA_CFG
  ib_wr(0);
  
  //do something AA related
  ib_wr(PKT3(PKT3_SET_CTX_REG,3));
  ib_wr(CTX_REG_IDX(PA_SC_AA_MASK_X0Y0_X1Y0));
  //PA_SC_AA_MASK_X0Y0_X1Y0
  ib_wr(0xffffffff);
  //PA_SC_AA_MASK_X0Y1_X1Y1
  ib_wr(0xffffffff);
  //---------------------------------------------------------------------------- 
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,10));
  ib_wr(CTX_REG_IDX(PA_SC_CLIPRECT_RULE));
  //PA_SC_CLIPRECT_RULE: no scissor required then clip rule is 0xffff (no specs
  //provided)
  ib_wr(set(PSCR_CLIP_RULE,0xffff));
  //PA_SC_CLIPRECT_0_TL
  ib_wr(0);
  //PA_SC_CLIPRECT_0_BR
  ib_wr(set(PSCB_X,params_3d->w)|set(PSCB_Y,params_3d->h));
  //PA_SC_CLIPRECT_1_TL
  ib_wr(0);
  //PA_SC_CLIPRECT_1_BR
  ib_wr(set(PSCB_X,params_3d->w)|set(PSCB_Y,params_3d->h));
  //PA_SC_CLIPRECT_2_TL
  ib_wr(0);
  //PA_SC_CLIPRECT_2_BR
  ib_wr(set(PSCB_X,params_3d->w)|set(PSCB_Y,params_3d->h));
  //PA_SC_CLIPRECT_3_TL
  ib_wr(0);
  //PA_SC_CLIPRECT_3_BR
  ib_wr(set(PSCB_X,params_3d->w)|set(PSCB_Y,params_3d->h));
  
  //---------------------------------------------------------------------------- 
  //Tells the SC (Scan Converter/rasteriser) we don't use line stipple since we
  //do not render line primitives.  XXX: ORed register? Because if not will set
  //all bits to 0!  We only want to set to 0 LINE_STIPPLE_ENA.
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(PA_SC_LINE_STIPPLE));
  //PA_SC_LINE_STIPPLE
  ib_wr(0);
  
  //Even if we are not rendering line primitives, tells the PA (Primitive
  //Assembler) SC (scan converter/rasteriser) to do "something" with the last
  //pixel
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(PA_SC_LINE_CTL));
  //PA_SC_LINE_CTL
  ib_wr(PSLC_LAST_PIXEL);
  //---------------------------------------------------------------------------- 
  
  //---------------------------------------------------------------------------- 
  //set the value of the scissors
  ib_wr(PKT3(PKT3_SET_CTX_REG,3));
  ib_wr(CTX_REG_IDX(PA_SC_GENERIC_SCISSOR_TL));
  //PA_SC_GENERIC_SCISSOR_TL
  ib_wr(set(PSGST_X,0)|set(PSGST_Y,0));
  //PA_SC_GENERIC_SCISSOR_BR
  ib_wr(set(PSGSB_X,params_3d->w)|set(PSGSB_Y,params_3d->h));
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,3));
  ib_wr(CTX_REG_IDX(PA_SC_SCREEN_SCISSOR_TL));
  //PA_SC_SCREEN_SCISSOR_TL
  ib_wr(set(PSSST_X,0)|set(PSSST_Y,0));
  //PA_SC_SCREEN_SCISSOR_BR
  ib_wr(set(PSSSB_X,params_3d->w)|set(PSSSB_Y,params_3d->h));
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,4));
  ib_wr(CTX_REG_IDX(PA_SC_WND_OF));
  //PA_SC_WND_OF: the window offset in the screen which can be used by many
  //scissors.
  ib_wr(0);
  //PA_SC_WND_SCISSOR_TL
  ib_wr(set(PSWST_X,0)|set(PSWST_Y,0));
  //PA_SC_WND_SCISSOR_BR
  ib_wr(set(PSWSB_X,params_3d->w)|set(PSWSB_Y,params_3d->h));
  //---------------------------------------------------------------------------- 
  
  ctx_pa_sc_vports(ib,params_3d);
}

//all PA (Primitive Assembler) context registers: setup unit, clipper,
//then scan converter
static void ctx_pa(uint32_t **ib,struct params_3d *params_3d)
{
  //PA (Primitive Assembler)
  ctx_pa_su(ib);
  ctx_pa_cl(ib);
  ctx_pa_sc(ib,params_3d);
}

//DB context registers: disable the depth/stencil path entirely, and still
//write sane neutral values in the related registers
static void ctx_dbs(uint32_t **ib)
{
  //DBs (Depth Blocks)
  //disable the depth stencil/z-buffer
  ib_wr(PKT3(PKT3_SET_CTX_REG,3));
  ib_wr(CTX_REG_IDX(DB_Z_INFO));
  //DB_Z_INFO
  ib_wr(0);
  //DB_STENCIL_INFO
  ib_wr(0);
  
  //even if disabled, setup some clean values in a few regs
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(DB_DEPTH_CTL));
  //DB_DEPTH_CTL
  ib_wr(0);
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,5));
  ib_wr(CTX_REG_IDX(DB_DEPTH_BOUNDS_MIN));
  //DB_DEPTH_BOUNDS_MIN
  ib_wr(0);
  //DB_DEPTH_BOUNDS_MAX
  ib_wr(0);
  //DB_STENCIL_CLR
  ib_wr(0);
  //DB_DEPTH_CLR
  ib_wr(f2u(1.0f));
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(DB_RENDER_CTL));
  //DB_RENDER_CTL
  ib_wr(0);
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(DB_RENDER_OVERRIDE_0));
  //DB_RENDER_OVERRIDE_0: no forced hierarchical z/stencil
  ib_wr(set(DRO_FORCE_HIZ_ENA,DRO_FORCE_DIS)
                                         |set(DRO_FORCE_HIS_ENA_0,DRO_FORCE_DIS)
                                                       |set(DRO_FORCE_HIS_ENA_1,
                                                                DRO_FORCE_DIS));
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(DB_STENCIL_CTL));
  //DB_STENCIL_CTL
  ib_wr(0);
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,4));
  ib_wr(CTX_REG_IDX(DB_SRESULTS_CMP_STATE_0));
  //DB_SRESULTS_CMP_STATE_0
  ib_wr(0);
  //DB_SRESULTS_CMP_STATE_1
  ib_wr(0);
  //DB_PRELOAD_CTL
  ib_wr(0);
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(DB_ALPHA_TO_MASK));
  //DB_ALPHA_TO_MASK
  ib_wr(set(DATM_ALPHA_TO_MASK_OF_0,2)|set(DATM_ALPHA_TO_MASK_OF_1,2)
                |set(DATM_ALPHA_TO_MASK_OF_2,2)|set(DATM_ALPHA_TO_MASK_OF_3,2));
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,3));
  ib_wr(CTX_REG_IDX(DB_STENCILREFMASK));
  //DB_STENCILREFMASK
  ib_wr(0);
  //DB_STENCILREFMASK_BF
  ib_wr(0);
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(DB_SH_CTL));
  //DB_SH_CTL
  ib_wr(set(DSC_Z_ORDER,DSC_EARLY_Z_THEN_LATE_Z));
}

//blend blocks of CBs (Color Blocks): disable blending on all 8 CBs by
//writing 0 in each CB_x_BLEND_CTL register with a single packet
static void ctx_cbs_blend(uint32_t **ib)
{
  uint8_t cb_idx;

  ib_wr(PKT3(PKT3_SET_CTX_REG,9));
  ib_wr(CTX_REG_IDX(CB_0_BLEND_CTL));
  //CB_0_BLEND_CTL..CB_7_BLEND_CTL: disable blending
  for(cb_idx=0;cb_idx<8;++cb_idx)
    ib_wr(0);
}

//CB 0 setup: point it at the framebuffer and describe the surface layout
//(pitch/slice in 8x8 thin1 tiles, 8888 unorm format)
static void ctx_cb_0(uint32_t **ib,struct params_3d *params_3d)
{
  //CB 0 (Color Block 0)
  ib_wr(PKT3(PKT3_SET_CTX_REG,7));
  ib_wr(CTX_REG_IDX(CB_0_COLOR_BASE));
  //CB_0_COLOR_BASE: framebuffer address is 256 bytes aligned, stored >>8
  ib_wr(params_3d->fb_gpu_addr>>8);
  //CB_0_COLOR_PITCH: a thin1 tile is 8x8 pixels
  ib_wr(set(CCP_TILE_MAX,params_3d->w/8-1)); 
  //CB_0_COLOR_SLICE: a thin1 tile is 8x8 pixels
  ib_wr(set(CCS_TILE_MAX,params_3d->w*params_3d->h/64-1));
  //CB_0_COLOR_VIEW: 0, or last tile index for an array of slices
  ib_wr(0);
  //CB_0_COLOR_INFO: for sRGB color space, in 8 bits little endian argb, the
  //color component swap is ALT for the color components from the pixel/fragment
  //shader and value must be clamped before and after blending to mrt range.
  ib_wr(set(CCI_ENDIAN,CCI_ENDIAN_NONE)|set(CCI_FMT,CCI_COLOR_8_8_8_8)
         |set(CCI_COMP_SWAP, CCI_SWAP_ALT)|set(CCI_NUMBER_TYPE,CCI_NUMBER_UNORM)
                                                              |CCI_BLEND_CLAMP);
  //CB_0_COLOR_ATTRIB: see gpu/tiling.c
  ib_wr(set(CCA_TILE_MODE_IDX,8));
}

//CB context registers: blending off, CB 0 surface setup, rgba masks open,
//normal color mode
static void ctx_cbs(uint32_t **ib,struct params_3d *params_3d)
{
  //CBs (Color Blocks)
  ctx_cbs_blend(ib);
  
  ctx_cb_0(ib,params_3d);
  
  //do enable all color components (RGBA) from the pixel/fragment shader to be
  //used by the CB 0 and do enable CB 0 to output all computed color components
  //to target (here our framebuffer)
  ib_wr(PKT3(PKT3_SET_CTX_REG,3));
  ib_wr(CTX_REG_IDX(CB_TGT_MASK));
  //CB_TGT_MASK
  ib_wr(set(CTM_TGT_0_ENA,CTM_TGT_RED|CTM_TGT_GREEN|CTM_TGT_BLUE
                                                               |CTM_TGT_ALPHA));
  //CB_SH_MASK
  ib_wr(set(CSM_OUTPUT_0_ENA,CSM_OUTPUT_RED|CSM_OUTPUT_GREEN|CSM_OUTPUT_BLUE
                                                            |CSM_OUTPUT_ALPHA));
  
  ib_wr(PKT3(PKT3_SET_CTX_REG,2));
  ib_wr(CTX_REG_IDX(CB_COLOR_CTL));
  //CB_COLOR_CTL: switch normal mode for all CBs
  ib_wr(set(CCC_MODE,CCC_CB_NORMAL)|set(CCC_ROP3,CCC_0XCC));
}

//program the whole GPU context for this draw, block by block
static void ctx(uint32_t **ib,struct params_3d *params_3d)
{
  ctx_misc_init(ib);
  ctx_vgt(ib);
  ctx_spi(ib,params_3d);
  ctx_pa(ib,params_3d);
  ctx_dbs(ib);
  ctx_cbs(ib,params_3d);
}

//emit the draw packets: auto-generated indices, 1 instance, VERTICES_N-1
//(the trailing "null" vertex is excluded) indices
static void draw(uint32_t **ib)
{
  ib_wr(PKT3(PKT3_IDX_TYPE,1));
  ib_wr(set(PKT3_SZ,PKT3_16BITS));

  ib_wr(PKT3(PKT3_INST_N,1));
  ib_wr(1);

  ib_wr(PKT3(PKT3_DRAW_IDX_AUTO,2));
  //3 indices to generate
  ib_wr(VERTICES_N-1); 
  //VGT_DRAW_INITIATOR
  ib_wr(set(VDI_SRC_SELECT,VDI_AUTO_IDX));
}

//pad the IB with type 2 (nop) packets until its dword count is a multiple of
//the PFP (Pre-Fetch Parser) fetch granularity
static void pfp_align(uint32_t **ib,uint32_t *ib_start)
{
  while(((*ib-ib_start)&CP_RING_PFP_DW_MASK)!=0) ib_wr(PKT2);
}

//build the complete 3D command stream at ib_start; returns its size in dwords
static uint64_t ib_3d(uint32_t *ib_start, struct params_3d *params_3d)
{
  uint32_t *ib=ib_start;
  prelude(&ib);
  //============================================================================
  //the real thing is here
  cfg(&ib);
  ctx(&ib,params_3d);
  //============================================================================
  draw(&ib);
  pfp_align(&ib,ib_start);
  return ib-ib_start;
}

//We prepare a big buffer with everything cpu side, then dma it to gpu vram, run
//it, and wait for a fence.
int main(int argc, char *argv[])
{
  int r0=0;
  int r1=0;
  //----------------------------------------------------------------------------
  //arguments
  if(argc<4){
    e("missing arguments");
    r0=EXIT_FAILURE;
    goto exit;
  }
  uint64_t fb_gpu_addr=strtoul(argv[1],NULL,16);
  uint64_t w=strtoul(argv[2],NULL,10);
  uint64_t h=strtoul(argv[3],NULL,10);
  //----------------------------------------------------------------------------

  o("drawing a triangle:fb=0x%016Lx,w=%Lu;h=%Lu",(ull)fb_gpu_addr,(ull)w,
                                                                        (ull)h);

  //----------------------------------------------------------------------------
  errno=0;
  int f=open("/dev/si0",O_RDWR);
  if(f==-1){
    e("open failed:%s",strerror(errno));
    r0=EXIT_FAILURE;
    goto exit;
  }
  //----------------------------------------------------------------------------

  //----------------------------------------------------------------------------
  //pre-compute aligned offsets and the aligned size of our vram buffer

  //vertex shader must be 256 bytes aligned (order 8), 0 since we will allocate
  //a 256 bytes aligned buffer.
  uint64_t vs_of=0;
  //pixel/fragment shader must be 256 bytes aligned (order 8)
  uint64_t ps_of=next_aligned_of(sizeof(vs),8);
  //vertices are fetch by block of 4 vertices, dw aligned (order 2)
  uint64_t vertices_of=next_aligned_of(ps_of+sizeof(ps),2);
  //ib is aligned on prefetch size which is 16 dws (order 6)
  uint64_t ib_of=next_aligned_of(vertices_of+sizeof(vertices),6);

  //worst alignment is 256 bytes (order 8), then round up for allocation
  uint64_t vram_buf_sz=next_aligned_of(ib_of+(IB_DWS_N_MAX<<2),8);
  o("vs_of=0x%016llx ps_of=0x%016llx vertices_of=0x%016llx ib_of=0x%016llx vram_buf_sz=0x%016llx",
            (ull)vs_of,(ull)ps_of,(ull)vertices_of,(ull)ib_of,(ull)vram_buf_sz);
  //----------------------------------------------------------------------------
 
  //----------------------------------------------------------------------------
  o("allocating 256 bytes aligned vram buffer...");
  struct si_mem mem;
  mem.align=256;//worst alignment is vs and ps
  mem.sz=vram_buf_sz;
  errno=0;
  ul req=_IOWR('d',SI_MEM_ALLOC,mem);
  r1=ioctl(f,req,&mem);
  if(r1==-1){
    e("alloc vram buffer failed:%s",strerror(errno));
    r0=EXIT_FAILURE;
    goto exit;
  }
  o("vram_buf_gpu_addr=0x%016llx",(ull)mem.gpu_addr);
  o("allocating 256 bytes aligned vram buffer done");
  //----------------------------------------------------------------------------

  //----------------------------------------------------------------------------
  //now, we have enough info to init the parameters for the pipeline
  struct params_3d params_3d;
  params_3d.vs_gpu_addr=mem.gpu_addr+vs_of;
  params_3d.ps_gpu_addr=mem.gpu_addr+ps_of;
  params_3d.w=w;
  params_3d.h=h;
  params_3d.fb_gpu_addr=fb_gpu_addr;
  o("params_3d:vs_gpu_addr=0x%016llx ps_gpu_addr=0x%016llx w=%llu h=%llu fb_gpu_addr=0x%016llx",
            (ull)params_3d.vs_gpu_addr,(ull)params_3d.ps_gpu_addr,(ull)w,(ull)h,
                                                    (ull)params_3d.fb_gpu_addr);
  //----------------------------------------------------------------------------

  //----------------------------------------------------------------------------
  o("mmaping an aperture...");
  //get an aperture of the size of our vram buffer for dma
  errno=0;
  void *dma_buffer=mmap(NULL,vram_buf_sz,PROT_READ|PROT_WRITE,MAP_SHARED,f,0);
  if(dma_buffer==MAP_FAILED){
    e("unable to mmap an aperture buffer:%s",strerror(errno));
    r0=EXIT_FAILURE;
    goto free_vram_buf;
  }
  o("dma buffer=%p",dma_buffer);
  o("mmaping an aperture done");
  //----------------------------------------------------------------------------

  //----------------------------------------------------------------------------
  //configure buffer resources
  uint64_t vtx_buf_gpu_addr=mem.gpu_addr+vertices_of;
  o("buffer resources:vtx_buf_gpu=0x%016llx",(ull)vtx_buf_gpu_addr);

  //vertex position buffer start address
  buf_res_descs[0]=lower_32_bits(vtx_buf_gpu_addr); 
  buf_res_descs[1]|=upper_32_bits(vtx_buf_gpu_addr); 
  //vertex color buffer start address
  buf_res_descs[4]=lower_32_bits(vtx_buf_gpu_addr+VERTICES_N*sizeof(float));
  buf_res_descs[5]|=upper_32_bits(vtx_buf_gpu_addr+VERTICES_N*sizeof(float));
  //----------------------------------------------------------------------------

  //----------------------------------------------------------------------------
  o("copying static data into dma buffer...");
  cpy_htole32(dma_buffer+vs_of,&vs[0],sizeof(vs)>>2);
  cpy_htole32(dma_buffer+ps_of,&ps[0],sizeof(ps)>>2);
  cpy_htole32(dma_buffer+vertices_of,(uint32_t*)&vertices[0],
                                                           sizeof(vertices)>>2);
  o("copying static data into dma buffer done");
  //----------------------------------------------------------------------------

  //============================================================================
  //there, we program the 3D pipeline
  uint64_t ib_dws_n=ib_3d(dma_buffer+ib_of,&params_3d);
  o("ib_dws_n=0x%016llx(max=0x%016llx)",(ull)ib_dws_n,(ull)IB_DWS_N_MAX);
  //============================================================================
 
  //----------------------------------------------------------------------------
  o("dma-ing the cpu buffer to vram buffer...");
  struct si_dma dma;
  struct si_dma_l2l *l2l=&dma.params.l2l;
  struct si_timeouts_info *t_info=&dma.t_info;
  dma.type=SI_DMA_TYPE_L2L;
  dma.dir=SI_DMA_TO_DEVICE;
  //we don't really care here lets put one seconde!
  t_info->ring.n_max=1;
  t_info->ring.us=1000000;
  t_info->fence.n_max=1;
  t_info->fence.us=1000000;
  l2l->src_addr=(uint64_t)dma_buffer;
  l2l->dst_addr=mem.gpu_addr;
  l2l->sz=vram_buf_sz;
  req=_IOW('d',SI_DMA,dma);
  errno=0;
  r1=ioctl(f,req,&dma);
  switch(r1){
  case -1:
    e("dma l2l failed:%s",strerror(errno));
    r0=EXIT_FAILURE;
    goto free_vram_buf;
  case SI_RING_TIMEOUT:
    e("dma l2l failed:ring timeout");
    r0=EXIT_FAILURE;
    goto free_vram_buf;
  case SI_FENCE_TIMEOUT:
    e("dma l2l failed:fence timeout");
    r0=EXIT_FAILURE;
    goto free_vram_buf;
  }
  o("dma-ing the cpu buffer to vram buffer done");
  //----------------------------------------------------------------------------

  //----------------------------------------------------------------------------
  o("running the ib...");
  struct si_gpu_3d_ib gpu_3d_ib;
  struct si_timeout_info *ring_t_info=&gpu_3d_ib.ring_t_info;
  //we don't really care here lets put one seconde!
  ring_t_info->n_max=1;
  ring_t_info->us=1000000;
  gpu_3d_ib.gpu_addr=mem.gpu_addr+ib_of;
  gpu_3d_ib.dws_n=ib_dws_n;
  req=_IOW('d',SI_GPU_3D_IB,gpu_3d_ib);
  errno=0;
  r1=ioctl(f,req,&gpu_3d_ib);
  switch(r1){
  case -1:
    e("running the GPU_3D indirecting buffer failed:%s",strerror(errno));
    r0=EXIT_FAILURE;
    goto free_vram_buf;
  case SI_RING_TIMEOUT:
    e("running the GPU_3D indirecting buffer failed:ring timeout");
    r0=EXIT_FAILURE;
    goto free_vram_buf;
  }
  o("running the ib done");
  //----------------------------------------------------------------------------

  //----------------------------------------------------------------------------
  o("fencing...");
  struct si_gpu_3d_fence gpu_3d_fence;
  t_info=&gpu_3d_fence.t_info;
  //we don't really care here lets put one seconde!
  t_info->ring.n_max=1;
  t_info->ring.us=1000000;
  t_info->fence.n_max=1;
  t_info->fence.us=1000000;
  req=_IOW('d',SI_GPU_3D_FENCE,gpu_3d_fence);
  errno=0;
  r1=ioctl(f,req,&gpu_3d_fence);
  switch(r1){
  case -1:
    e("waiting for fence failed:%s",strerror(errno));
    r0=EXIT_FAILURE;
    break;
  case SI_RING_TIMEOUT:
    e("waiting for fence failed:ring timeout");
    r0=EXIT_FAILURE;
    break;
  case SI_FENCE_TIMEOUT:
    e("waiting for fence failed:fence timeout");
    r0=EXIT_FAILURE;
    break;
  }
  o("fencing done");
  //----------------------------------------------------------------------------

free_vram_buf:
  //----------------------------------------------------------------------------
  o("freeing vram buffer...");
  req=_IOW('d',SI_MEM_FREE,mem.gpu_addr);
  errno=0;
  r1=ioctl(f,req,&mem.gpu_addr);
  if(r1==-1){
    e("free vram buffer failed (LEAK!):%s",strerror(errno));
    r0=EXIT_FAILURE;
  }
  o("freeing vram buffer done");
  //----------------------------------------------------------------------------
exit:
  return r0;
}
